Step 1: Import data
Step 2: Collect initial data
Step 3: Describe data
Step 4: Verify data quality
Step 5: Explore data
(Hotz, 2023)
# Import library for reading .csv files
import pandas as pd # Data analysis and manipulation library
import re # Module used for string searching and manipulation
import io # Module used for reading and writing data
# (GeeksforGeeks, 2022)
# Read the CSV file
with open('TeleCom_Data-1.csv', 'r') as file:
csv_text = file.read()
# Define the separator
separator = ';'
# Handle special symbols to split in the csv into seperate columns using regular expressions
csv_text = re.sub(r'";', separator, csv_text)
csv_text = re.sub(r';"', separator, csv_text)
csv_text = re.sub(r'";"', separator, csv_text)
# Convert the csv into a dataframe using pandas
df = pd.read_csv(io.StringIO(csv_text), sep=separator)
# Display the first few rows of the dataframe
df.head()
#(Grabbing CSV Information With Regex in Python, n.d.)
| age;job | marital | education | default | housing | loan | contact | month | day_of_week | duration | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y""" | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40;admin. | married | basic.6y | no | no | no | telephone | may | mon" | 151 | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no" |
| 1 | 56;services | married | high.school | no | no | yes | telephone | may | mon" | 307 | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no" |
| 2 | 45;services | married | basic.9y | unknown | no | no | telephone | may | mon" | 198 | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no" |
| 3 | 59;admin. | married | professional.course | no | no | no | telephone | may | mon" | 139 | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no" |
| 4 | 41;blue-collar | married | unknown | unknown | no | no | telephone | may | mon" | 217 | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no" |
# Split the 'age;job' column into two separate columns: 'age' and 'job'
df[['age', 'job']] = df["age;job"].str.split(';', expand=True)
# Drop the original 'age;job' column
df.drop(columns='age;job', inplace=True)
df.head()
| marital | education | default | housing | loan | contact | month | day_of_week | duration | campaign | ... | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y""" | age | job | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | married | basic.6y | no | no | no | telephone | may | mon" | 151 | 1 | ... | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no" | 40 | admin. |
| 1 | married | high.school | no | no | yes | telephone | may | mon" | 307 | 1 | ... | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no" | 56 | services |
| 2 | married | basic.9y | unknown | no | no | telephone | may | mon" | 198 | 1 | ... | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no" | 45 | services |
| 3 | married | professional.course | no | no | no | telephone | may | mon" | 139 | 1 | ... | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no" | 59 | admin. |
| 4 | married | unknown | unknown | no | no | telephone | may | mon" | 217 | 1 | ... | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no" | 41 | blue-collar |
5 rows × 21 columns
# Get rid of the special symbols on rows in certain columns
df['y"""'] = df['y"""'].str.replace('"', "")
df['day_of_week'] = df['day_of_week'].str.replace('"', "")
df.head()
| marital | education | default | housing | loan | contact | month | day_of_week | duration | campaign | ... | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y""" | age | job | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | married | basic.6y | no | no | no | telephone | may | mon | 151 | 1 | ... | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no | 40 | admin. |
| 1 | married | high.school | no | no | yes | telephone | may | mon | 307 | 1 | ... | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no | 56 | services |
| 2 | married | basic.9y | unknown | no | no | telephone | may | mon | 198 | 1 | ... | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no | 45 | services |
| 3 | married | professional.course | no | no | no | telephone | may | mon | 139 | 1 | ... | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no | 59 | admin. |
| 4 | married | unknown | unknown | no | no | telephone | may | mon | 217 | 1 | ... | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no | 41 | blue-collar |
5 rows × 21 columns
# Rename the target variable column to get rid of special symbols
df.rename(columns={'y"""': 'y'}, inplace=True)
df.head()
| marital | education | default | housing | loan | contact | month | day_of_week | duration | campaign | ... | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | age | job | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | married | basic.6y | no | no | no | telephone | may | mon | 151 | 1 | ... | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no | 40 | admin. |
| 1 | married | high.school | no | no | yes | telephone | may | mon | 307 | 1 | ... | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no | 56 | services |
| 2 | married | basic.9y | unknown | no | no | telephone | may | mon | 198 | 1 | ... | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no | 45 | services |
| 3 | married | professional.course | no | no | no | telephone | may | mon | 139 | 1 | ... | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no | 59 | admin. |
| 4 | married | unknown | unknown | no | no | telephone | may | mon | 217 | 1 | ... | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no | 41 | blue-collar |
5 rows × 21 columns
# Display the shape and first 5 rows of the data
print(df.shape)
df.head()
(41180, 21)
| marital | education | default | housing | loan | contact | month | day_of_week | duration | campaign | ... | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | age | job | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | married | basic.6y | no | no | no | telephone | may | mon | 151 | 1 | ... | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no | 40 | admin. |
| 1 | married | high.school | no | no | yes | telephone | may | mon | 307 | 1 | ... | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no | 56 | services |
| 2 | married | basic.9y | unknown | no | no | telephone | may | mon | 198 | 1 | ... | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no | 45 | services |
| 3 | married | professional.course | no | no | no | telephone | may | mon | 139 | 1 | ... | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no | 59 | admin. |
| 4 | married | unknown | unknown | no | no | telephone | may | mon | 217 | 1 | ... | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no | 41 | blue-collar |
5 rows × 21 columns
# Look at 5 random rows of the data
df.sample(5)
| marital | education | default | housing | loan | contact | month | day_of_week | duration | campaign | ... | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | age | job | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 12604 | married | professional.course | no | no | no | cellular | jul | mon | 167 | 2 | ... | 0 | nonexistent | 1.4 | 93.918 | -42.7 | 4.960 | 5228.1 | no | 32 | technician |
| 32038 | married | basic.9y | no | no | no | cellular | may | thu | 369 | 1 | ... | 0 | nonexistent | -1.8 | 92.893 | -46.2 | 1.327 | 5099.1 | no | 56 | blue-collar |
| 17570 | single | university.degree | unknown | no | no | cellular | jul | mon | 644 | 3 | ... | 0 | nonexistent | 1.4 | 93.918 | -42.7 | 4.962 | 5228.1 | no | 39 | technician |
| 22989 | married | university.degree | no | no | no | cellular | aug | mon | 98 | 3 | ... | 0 | nonexistent | 1.4 | 93.444 | -36.1 | 4.965 | 5228.1 | no | 50 | admin. |
| 39122 | married | high.school | no | yes | no | cellular | dec | wed | 158 | 3 | ... | 0 | nonexistent | -3.0 | 92.713 | -33.0 | 0.700 | 5023.5 | no | 31 | student |
5 rows × 21 columns
# Check how many rows and columns, how many non-null values and datatype of the datasets
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 41180 entries, 0 to 41179 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 marital 41180 non-null object 1 education 41180 non-null object 2 default 41180 non-null object 3 housing 41180 non-null object 4 loan 41180 non-null object 5 contact 41180 non-null object 6 month 41180 non-null object 7 day_of_week 41180 non-null object 8 duration 41180 non-null int64 9 campaign 41180 non-null int64 10 pdays 41180 non-null int64 11 previous 41180 non-null int64 12 poutcome 41180 non-null object 13 emp.var.rate 41180 non-null float64 14 cons.price.idx 41180 non-null float64 15 cons.conf.idx 41180 non-null float64 16 euribor3m 41180 non-null float64 17 nr.employed 41180 non-null float64 18 y 41180 non-null object 19 age 41180 non-null object 20 job 41180 non-null object dtypes: float64(5), int64(4), object(12) memory usage: 6.6+ MB
# Get a quick statistical overview of the data
df.describe()
| duration | campaign | pdays | previous | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | |
|---|---|---|---|---|---|---|---|---|---|
| count | 41180.000000 | 41180.000000 | 41180.000000 | 41180.000000 | 41180.000000 | 41180.000000 | 41180.000000 | 41180.000000 | 41180.000000 |
| mean | 258.280427 | 2.567800 | 962.516707 | 0.172705 | 0.081901 | 93.575508 | -40.501999 | 3.621422 | 5167.053344 |
| std | 259.299856 | 2.770225 | 186.809028 | 0.493719 | 1.571037 | 0.578762 | 4.627358 | 1.734385 | 72.230334 |
| min | 0.000000 | 1.000000 | 0.000000 | 0.000000 | -3.400000 | 92.201000 | -50.800000 | 0.634000 | 4963.600000 |
| 25% | 102.000000 | 1.000000 | 999.000000 | 0.000000 | -1.800000 | 93.075000 | -42.700000 | 1.344000 | 5099.100000 |
| 50% | 180.000000 | 2.000000 | 999.000000 | 0.000000 | 1.100000 | 93.749000 | -41.800000 | 4.857000 | 5191.000000 |
| 75% | 319.000000 | 3.000000 | 999.000000 | 0.000000 | 1.400000 | 93.994000 | -36.400000 | 4.961000 | 5228.100000 |
| max | 4918.000000 | 56.000000 | 999.000000 | 7.000000 | 1.400000 | 94.767000 | -26.900000 | 5.045000 | 5228.100000 |
# Check distribution of target variable
df['y'].value_counts()
no 36542 yes 4638 Name: y, dtype: int64
# Check for missing values to be sure
df.isnull().sum()
marital 0 education 0 default 0 housing 0 loan 0 contact 0 month 0 day_of_week 0 duration 0 campaign 0 pdays 0 previous 0 poutcome 0 emp.var.rate 0 cons.price.idx 0 cons.conf.idx 0 euribor3m 0 nr.employed 0 y 0 age 0 job 0 dtype: int64
# Check for duplicates
df.duplicated().sum()
12
# Group together and compare the duplicate and their original counterpart
duplicated_groups = df.groupby(list(df.columns)).filter(lambda x: len(x) > 1)
print(duplicated_groups)
# (So, 2023)
marital education default housing loan contact month \
1261 married basic.6y no no no telephone may
1262 married basic.6y no no no telephone may
12256 married unknown no no no telephone jul
12257 married unknown no no no telephone jul
14151 single professional.course no no no cellular jul
14230 single professional.course no no no cellular jul
16815 divorced high.school no yes no cellular jul
16952 divorced high.school no yes no cellular jul
18460 single professional.course no yes no cellular jul
18461 single professional.course no yes no cellular jul
20068 married high.school unknown no no cellular aug
20212 married high.school unknown no no cellular aug
20527 married professional.course no yes no cellular aug
20530 married professional.course no yes no cellular aug
25179 married university.degree no no no cellular nov
25213 married university.degree no no no cellular nov
28472 single high.school no yes no cellular apr
28473 single high.school no yes no cellular apr
32501 married university.degree no yes no cellular may
32512 married university.degree no yes no cellular may
36946 married university.degree no no no cellular jul
36947 married university.degree no no no cellular jul
38251 single university.degree no no no telephone oct
38277 single university.degree no no no telephone oct
day_of_week duration campaign ... previous poutcome \
1261 thu 124 1 ... 0 nonexistent
1262 thu 124 1 ... 0 nonexistent
12256 thu 88 1 ... 0 nonexistent
12257 thu 88 1 ... 0 nonexistent
14151 mon 331 2 ... 0 nonexistent
14230 mon 331 2 ... 0 nonexistent
16815 thu 43 3 ... 0 nonexistent
16952 thu 43 3 ... 0 nonexistent
18460 thu 128 1 ... 0 nonexistent
18461 thu 128 1 ... 0 nonexistent
20068 mon 33 1 ... 0 nonexistent
20212 mon 33 1 ... 0 nonexistent
20527 tue 127 1 ... 0 nonexistent
20530 tue 127 1 ... 0 nonexistent
25179 tue 123 2 ... 0 nonexistent
25213 tue 123 2 ... 0 nonexistent
28472 tue 114 1 ... 0 nonexistent
28473 tue 114 1 ... 0 nonexistent
32501 fri 348 4 ... 0 nonexistent
32512 fri 348 4 ... 0 nonexistent
36946 thu 252 1 ... 0 nonexistent
36947 thu 252 1 ... 0 nonexistent
38251 tue 120 1 ... 0 nonexistent
38277 tue 120 1 ... 0 nonexistent
emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed \
1261 1.1 93.994 -36.4 4.855 5191.0
1262 1.1 93.994 -36.4 4.855 5191.0
12256 1.4 93.918 -42.7 4.966 5228.1
12257 1.4 93.918 -42.7 4.966 5228.1
14151 1.4 93.918 -42.7 4.962 5228.1
14230 1.4 93.918 -42.7 4.962 5228.1
16815 1.4 93.918 -42.7 4.962 5228.1
16952 1.4 93.918 -42.7 4.962 5228.1
18460 1.4 93.918 -42.7 4.968 5228.1
18461 1.4 93.918 -42.7 4.968 5228.1
20068 1.4 93.444 -36.1 4.965 5228.1
20212 1.4 93.444 -36.1 4.965 5228.1
20527 1.4 93.444 -36.1 4.966 5228.1
20530 1.4 93.444 -36.1 4.966 5228.1
25179 -0.1 93.200 -42.0 4.153 5195.8
25213 -0.1 93.200 -42.0 4.153 5195.8
28472 -1.8 93.075 -47.1 1.423 5099.1
28473 -1.8 93.075 -47.1 1.423 5099.1
32501 -1.8 92.893 -46.2 1.313 5099.1
32512 -1.8 92.893 -46.2 1.313 5099.1
36946 -2.9 92.469 -33.6 1.072 5076.2
36947 -2.9 92.469 -33.6 1.072 5076.2
38251 -3.4 92.431 -26.9 0.742 5017.5
38277 -3.4 92.431 -26.9 0.742 5017.5
y age job
1261 no 39 blue-collar
1262 no 39 blue-collar
12256 no 36 retired
12257 no 36 retired
14151 no 27 technician
14230 no 27 technician
16815 no 47 technician
16952 no 47 technician
18460 no 32 technician
18461 no 32 technician
20068 no 55 services
20212 no 55 services
20527 no 41 technician
20530 no 41 technician
25179 no 39 admin.
25213 no 39 admin.
28472 no 24 services
28473 no 24 services
32501 no 35 admin.
32512 no 35 admin.
36946 yes 45 admin.
36947 yes 45 admin.
38251 no 71 retired
38277 no 71 retired
[24 rows x 21 columns]
# Remove the duplicate rows while keeping the first occurence
df = df.drop_duplicates()
# Verify duplicates are removed
print(df.duplicated().sum()) # Should return 0 now
0
# General Summary
summary_df = df.describe(include='all').transpose()
# Display Summary
print(summary_df)
#(So, 2023)
count unique top freq mean \
marital 41168 4 married 24914 NaN
education 41168 8 university.degree 12162 NaN
default 41168 3 no 32570 NaN
housing 41168 3 yes 21566 NaN
loan 41168 3 no 33931 NaN
contact 41168 2 cellular 26131 NaN
month 41168 10 may 13763 NaN
day_of_week 41168 5 thu 8617 NaN
duration 41168.0 NaN NaN NaN 258.311237
campaign 41168.0 NaN NaN NaN 2.568087
pdays 41168.0 NaN NaN NaN 962.506073
previous 41168.0 NaN NaN NaN 0.172756
poutcome 41168 3 nonexistent 35547 NaN
emp.var.rate 41168.0 NaN NaN NaN 0.081937
cons.price.idx 41168.0 NaN NaN NaN 93.575563
cons.conf.idx 41168.0 NaN NaN NaN -40.502261
euribor3m 41168.0 NaN NaN NaN 3.621425
nr.employed 41168.0 NaN NaN NaN 5167.052308
y 41168 2 no 36531 NaN
age 41168 78 31 1947 NaN
job 41168 12 admin. 10419 NaN
std min 25% 50% 75% max
marital NaN NaN NaN NaN NaN NaN
education NaN NaN NaN NaN NaN NaN
default NaN NaN NaN NaN NaN NaN
housing NaN NaN NaN NaN NaN NaN
loan NaN NaN NaN NaN NaN NaN
contact NaN NaN NaN NaN NaN NaN
month NaN NaN NaN NaN NaN NaN
day_of_week NaN NaN NaN NaN NaN NaN
duration 259.325938 0.0 102.0 180.0 319.0 4918.0
campaign 2.77053 1.0 1.0 2.0 3.0 56.0
pdays 186.835214 0.0 999.0 999.0 999.0 999.0
previous 0.493782 0.0 0.0 0.0 0.0 7.0
poutcome NaN NaN NaN NaN NaN NaN
emp.var.rate 1.57096 -3.4 -1.8 1.1 1.4 1.4
cons.price.idx 0.578761 92.201 93.075 93.749 93.994 94.767
cons.conf.idx 4.62702 -50.8 -42.7 -41.8 -36.4 -26.9
euribor3m 1.734375 0.634 1.344 4.857 4.961 5.045
nr.employed 72.230165 4963.6 5099.1 5191.0 5228.1 5228.1
y NaN NaN NaN NaN NaN NaN
age NaN NaN NaN NaN NaN NaN
job NaN NaN NaN NaN NaN NaN
import dataprep # Library used to simplify data preparation
from dataprep.eda import create_report # Function from dataprep used for exploratory data analysis
# Data Profiling using dataprep
report = create_report(df, title="Telecom Data EDA")
display(report)
#(Dong, 2023)
#(Dataprep.eda Inside Google Colab, n.d.)
0%| | 0/2756 [00:00<…
C:\Users\ans_b\anaconda3\lib\site-packages\dask\core.py:119: RuntimeWarning: invalid value encountered in divide
return func(*(_execute_task(a, cache) for a in args))
C:\Users\ans_b\anaconda3\lib\site-packages\dataprep\eda\distribution\render.py:274: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))
C:\Users\ans_b\anaconda3\lib\site-packages\dataprep\eda\distribution\render.py:274: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))